# Scrapy settings for ImageSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ImageSpider'

SPIDER_MODULES = ['ImageSpider.spiders']
NEWSPIDER_MODULE = 'ImageSpider.spiders'
TELNETCONSOLE_ENABLED=False

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ImageSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept-Encoding':'gzip, deflate, br',
    'Cookie':'szordlastsearchtime=1626413471; Hm_lvt_27618d67972a4cffd2be76380ef187c1=1626413473,1626422413; Hm_lpvt_27618d67972a4cffd2be76380ef187c1=1626426812',
    'Host':'www.msgao.com',
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ImageSpider.middlewares.ImageSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
   #'ImageSpider.middlewares.WebDriverMiddleware': 222,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'ImageSpider.pipelines.ImageFilePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
import os
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'gril_image')
# 过期天数
IMAGES_EXPIRES = 10  # 90天内抓取的都不会被重抓

AUTOTHROTTLE_ENABLED=True
AUTOTHROTTLE_START_DELAY=1
AUTOTHROTTLE_MAX_DELAY = 30

#开启本地缓存
# HTTPCACHE_ENABLED = True
# #将http缓存延迟时间
# HTTPCACHE_EXPIRATION_SECS = 1
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# 确保所有的爬虫实例使用Redis进行重复过滤
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# 将Requests队列持久化到Redis，可支持暂停或重启爬虫
SCHEDULER_PERSIST = True

# Requests的调度策略，默认优先级队列
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'

REDIS_HOST = '地址'
REDIS_PORT = 6379
REDIS_PARAMS = {
    'password': '密码',#密码
    'db':0#选择redis库
}
REDIS_URL='redis://:密码@地址:端口'


SELENIUM_DRIVER_NAME = 'chrome' #浏览器driver名字
SELENIUM_DRIVER_EXECUTABLE_PATH = os.path.join(project_dir, 'driver/chromedriver.exe') #浏览器driver的位置,提供的chromedriver.exe针对89.x版本
# http://chromedriver.storage.googleapis.com/index.html 其他版本可以这下载
#chrome浏览器的参数
SELENIUM_DRIVER_ARGUMENTS=['--headless', '--no-sandbox', '--disable-gpu']

DOWNLOAD_FAIL_ON_DATALOSS=False